Types of basic graphics and their ggplot2 variants

Scatterplots

We load the cars dataset and plot it.

# Load the built-in `cars` data set into the workspace.
data(cars)
# Open its help page for a description of the variables.
help(cars)
# Print the first six rows (columns: speed, dist).
head(cars)
##   speed dist
## 1     4    2
## 2     4   10
## 3     7    4
## 4     7   22
## 5     8   16
## 6     9   10
# Base-graphics scatterplot of the data frame's two columns.
plot(cars)

Load the ggplot2 library to see how it improves on R’s basic graphics:

# install.packages("ggplot2")  # Uncomment if ggplot2 is not installed yet
library(ggplot2)

# Basic scatterplot: speed on x, dist on y, points drawn as hollow circles
# (shape = 1).
ggplot(data = cars, mapping = aes(x = speed, y = dist)) +
  geom_point(shape = 1)

# Same points plus a linear-regression line (method = lm) with its
# confidence band (95% by default).
ggplot(data = cars, mapping = aes(x = speed, y = dist)) +
  geom_point(shape = 1) +
  geom_smooth(method = lm)

# Regression line only: se = FALSE switches the confidence band off.
ggplot(data = cars, mapping = aes(x = speed, y = dist)) +
  geom_point(shape = 1) +
  geom_smooth(method = lm, se = FALSE)

# Default smoother with its confidence band; ggplot2 reports the method it
# picked in the message recorded below.
ggplot(data = cars, mapping = aes(x = speed, y = dist)) +
  geom_point(shape = 1) +
  geom_smooth()
## `geom_smooth()` using method = 'loess'

More scatterplots

# Switch to the `diamonds` data set and inspect it.
data(diamonds)
help(diamonds)
head(diamonds)
## # A tibble: 6 x 10
##   carat       cut color clarity depth table price     x     y     z
##   <dbl>     <ord> <ord>   <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1  0.23     Ideal     E     SI2  61.5    55   326  3.95  3.98  2.43
## 2  0.21   Premium     E     SI1  59.8    61   326  3.89  3.84  2.31
## 3  0.23      Good     E     VS1  56.9    65   327  4.05  4.07  2.31
## 4  0.29   Premium     I     VS2  62.4    58   334  4.20  4.23  2.63
## 5  0.31      Good     J     SI2  63.3    58   335  4.34  4.35  2.75
## 6  0.24 Very Good     J    VVS2  62.8    57   336  3.94  3.96  2.48
str(diamonds)
## Classes 'tbl_df', 'tbl' and 'data.frame':    53940 obs. of  10 variables:
##  $ carat  : num  0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
##  $ cut    : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
##  $ color  : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
##  $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
##  $ depth  : num  61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
##  $ table  : num  55 61 65 58 58 57 57 55 61 61 ...
##  $ price  : int  326 326 327 334 335 336 336 337 337 338 ...
##  $ x      : num  3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
##  $ y      : num  3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
##  $ z      : num  2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...
# Price against carat; with no `shape` given, points are filled circles.
ggplot(data = diamonds, mapping = aes(x = carat, y = price)) +
  geom_point()

# Shape and size of the points can be set explicitly.
ggplot(data = diamonds, mapping = aes(x = carat, y = price)) +
  geom_point(shape = 25, size = 4)

# Mapping a factor (cut) to colour gives one colour per level.
ggplot(data = diamonds, mapping = aes(x = carat, y = price, colour = cut)) +
  geom_point(size = 1.5)

# Same scatterplot at decreasing opacity (alpha). No variable is mapped to a
# grey scale here; lower alpha simply lets overlapping points show density.
grayScale <- ggplot(data = diamonds, mapping = aes(x = carat, y = price))
grayScale + geom_point(alpha = 1)

grayScale + geom_point(alpha = 0.8)

grayScale + geom_point(alpha = 0.5)

grayScale + geom_point(alpha = 0.3)

# Binned alternative: the plane is divided into rectangles coloured by the
# number of observations falling in each one.
bin <- ggplot(data = diamonds, mapping = aes(x = carat, y = price))
bin + stat_bin2d()

Histograms

We load the mpg dataset and plot a histogram of highway fuel economy (hwy, in miles per gallon).

# Load the `mpg` data set, open its help page and preview the first rows.
data(mpg)
help(mpg)
head(mpg)
## # A tibble: 6 x 11
##   manufacturer model displ  year   cyl      trans   drv   cty   hwy    fl
##          <chr> <chr> <dbl> <int> <int>      <chr> <chr> <int> <int> <chr>
## 1         audi    a4   1.8  1999     4   auto(l5)     f    18    29     p
## 2         audi    a4   1.8  1999     4 manual(m5)     f    21    29     p
## 3         audi    a4   2.0  2008     4 manual(m6)     f    20    31     p
## 4         audi    a4   2.0  2008     4   auto(av)     f    21    30     p
## 5         audi    a4   2.8  1999     6   auto(l5)     f    16    26     p
## 6         audi    a4   2.8  1999     6 manual(m5)     f    18    26     p
## # ... with 1 more variables: class <chr>
# Base-graphics histogram of the hwy column.
hist(mpg$hwy)

ggplot histograms, with options for overlaying a density curve and for setting the number of bins.

# Histogram of hwy with a fixed bin width of 3.
ggplot(mpg, aes(x = hwy)) +
  geom_histogram(binwidth = 3)

# Alternatively, fix the number of bins instead of their width.
ggplot(mpg, aes(x = hwy)) +
  geom_histogram(bins = 30)

# Outline (colour) and fill of the bars can be set independently.
ggplot(mpg, aes(x = hwy)) +
  geom_histogram(binwidth = 0.5, colour = "red", fill = "green")

# Kernel density curve of hwy, filled in yellow.
ggplot(mpg, aes(x = hwy)) +
  geom_density(fill = "yellow")

# Histogram with a density curve on top. The histogram's y is mapped to the
# computed density (rather than the count) so both layers share one scale.
# after_stat() replaces the deprecated `..density..` notation and needs
# ggplot2 >= 3.3.0.
ggplot(mpg, aes(x = hwy)) +
  geom_histogram(
    aes(y = after_stat(density)),
    binwidth = 1.2,
    colour = "black",
    fill = "blue"
  ) +
  geom_density(alpha = 0.3, fill = "red")

Histograms can also be drawn with the variable split by category. To do so, we select two manufacturers: audi and volkswagen:

# Keep only the rows for the two manufacturers being compared.
mpg_subset <- mpg[mpg$manufacturer %in% c("audi", "volkswagen"), ]
str(mpg_subset)
## Classes 'tbl_df', 'tbl' and 'data.frame':    45 obs. of  11 variables:
##  $ manufacturer: chr  "audi" "audi" "audi" "audi" ...
##  $ model       : chr  "a4" "a4" "a4" "a4" ...
##  $ displ       : num  1.8 1.8 2 2 2.8 2.8 3.1 1.8 1.8 2 ...
##  $ year        : int  1999 1999 2008 2008 1999 1999 2008 1999 1999 2008 ...
##  $ cyl         : int  4 4 4 4 6 6 6 4 4 4 ...
##  $ trans       : chr  "auto(l5)" "manual(m5)" "manual(m6)" "auto(av)" ...
##  $ drv         : chr  "f" "f" "f" "f" ...
##  $ cty         : int  18 21 20 21 16 18 18 18 16 20 ...
##  $ hwy         : int  29 29 31 30 26 26 27 26 25 28 ...
##  $ fl          : chr  "p" "p" "p" "p" ...
##  $ class       : chr  "compact" "compact" "compact" "compact" ...
# Overlapping histograms: fill = manufacturer gives one histogram per
# manufacturer; position = "identity" draws them on top of each other, and
# alpha keeps both visible.
ggplot(data = mpg_subset, mapping = aes(x = hwy, fill = manufacturer)) +
  geom_histogram(binwidth = 2, alpha = 0.5, position = "identity")

# Side-by-side histograms: position = "dodge" places the bars of each
# manufacturer next to each other within every bin.
ggplot(data = mpg_subset, mapping = aes(x = hwy, fill = manufacturer)) +
  geom_histogram(binwidth = 2, position = "dodge")

# One density curve per manufacturer, fully opaque.
ggplot(data = mpg_subset, mapping = aes(x = hwy, fill = manufacturer)) +
  geom_density()

# Same curves, semi-transparent so the overlap is visible.
ggplot(data = mpg_subset, mapping = aes(x = hwy, fill = manufacturer)) +
  geom_density(alpha = 0.4)

More histograms and options

# More sophisticated histograms on a reproducible random sample of 1000
# diamonds (the seed fixes which rows are drawn).
set.seed(4235)
diamonds_small <- diamonds[sample(nrow(diamonds), 1000), ]

# Plain histogram of price for the sample.
ggplot(diamonds_small, aes(x = price)) +
  geom_histogram(binwidth = 1000)

# Frequency polygons, one per level of cut, on a density scale.
# y is mapped explicitly with after_stat(density), which replaces the
# deprecated positional `..density..` notation (needs ggplot2 >= 3.3.0).
ggplot(diamonds_small, aes(x = price, y = after_stat(density), colour = cut)) +
  geom_freqpoly(binwidth = 1000)

# Histogram filled by cut. geom_histogram stacks groups by default, so the
# bars of each cut sit on top of one another rather than overlapping.
ggplot(diamonds_small, aes(x = price, fill = cut)) +
  geom_histogram(binwidth = 1000)

# Overlapping, semi-transparent density curves, one per level of cut.
ggplot(diamonds_small, aes(x = price, fill = cut)) +
  geom_density(alpha = 0.3)

Gráficos de barras

Cargamos un dataset de propinas para observar su distribución

# install.packages("reshape2")  # Uncomment if reshape2 is not installed yet
library(reshape2)
# Load the `tips` data set, open its help page and preview the first rows.
data(tips)
help(tips)
head(tips)
##   total_bill  tip    sex smoker day   time size
## 1      16.99 1.01 Female     No Sun Dinner    2
## 2      10.34 1.66   Male     No Sun Dinner    3
## 3      21.01 3.50   Male     No Sun Dinner    3
## 4      23.68 3.31   Male     No Sun Dinner    2
## 5      24.59 3.61 Female     No Sun Dinner    4
## 6      25.29 4.71   Male     No Sun Dinner    4
# Basic bar chart of total bill against time of day. The y values are used
# as given, so each bar stacks every individual bill for that time.
# NOTE(review): the original note said more is spent at lunch than at
# dinner; confirm that reading against the rendered plot.
ggplot(data = tips, mapping = aes(x = time, y = total_bill)) +
  geom_col()

# Same chart with the bars filled according to the smoker category.
ggplot(data = tips, mapping = aes(x = time, y = total_bill, fill = smoker)) +
  geom_col()

Se puede observar que el viernes es cuando cada mesa paga más en media.

# Number of rows (one per recorded bill) for each day.
ggplot(data = tips, aes(x = day)) +
  geom_bar(stat = "count")

# Mean total bill for each day. `fun` replaces the deprecated `fun.y`
# argument of the summary stat (needs ggplot2 >= 3.3.0).
ggplot(data = tips, aes(x = day, y = total_bill)) +
  geom_bar(stat = "summary", fun = "mean")

Gráficos de factor con barras de error

Son gráficos parecidos a los de barras, pero con líneas que unen puntos. Cargamos un dataset de crecimiento de dientes en cobayas (conejillos de Indias).

# Load the built-in `ToothGrowth` data set, open its help page and preview
# the first rows (columns: len, supp, dose).
data(ToothGrowth)
help("ToothGrowth")
head(ToothGrowth)
##    len supp dose
## 1  4.2   VC  0.5
## 2 11.5   VC  0.5
## 3  7.3   VC  0.5
## 4  5.8   VC  0.5
## 5  6.4   VC  0.5
## 6 10.0   VC  0.5
# Short alias used by the code that follows.
tg <- ToothGrowth

Calculamos un resumen de las variables que nos permite realizar representaciones con márgenes de confianza:

# install.packages("Rmisc")  # Uncomment if Rmisc is not installed yet
library(Rmisc)
## Loading required package: lattice
## Loading required package: plyr
# Summarise len for every supp/dose combination; the result (printed below)
# has the group size N, the mean len, and the sd, se and ci columns.
tgc <- summarySE(
  data = tg,
  measurevar = "len",
  groupvars = c("supp", "dose")
)
print(tgc)
##   supp dose  N   len       sd        se       ci
## 1   OJ  0.5 10 13.23 4.459709 1.4102837 3.190283
## 2   OJ  1.0 10 22.70 3.910953 1.2367520 2.797727
## 3   OJ  2.0 10 26.06 2.655058 0.8396031 1.899314
## 4   VC  0.5 10  7.98 2.746634 0.8685620 1.964824
## 5   VC  1.0 10 16.77 2.515309 0.7954104 1.799343
## 6   VC  2.0 10 26.14 4.797731 1.5171757 3.432090

Hacemos gráficas de factor. Podemos observar la diferencia entre administrar a la cobaya un suplemento u otro. Las barras verticales son intervalos de confianza al 95% de los valores de longitud del diente según la dosis aportada de cada suplemento.

# Factor plot with error bars: mean len per dose, one line per supplement.
# The bars span len +/- ci, using the `ci` column of the summary table, so
# they show the confidence interval the accompanying text describes rather
# than one standard error.
ggplot(tgc, aes(x = dose, y = len, colour = supp)) +
  geom_errorbar(aes(ymin = len - ci, ymax = len + ci), width = 0.1) +
  geom_line() +
  geom_point()

# The same kind of plot for tips per day, drawn in red. The colour is set on
# the geoms: inside aes() the string "red" would be treated as a data value,
# giving a legend entry and a default palette colour instead of red.
# With a discrete x, ggplot2 groups by day, so geom_line() joins the points
# within each day.
ggplot(tips, aes(x = day, y = tip)) +
  geom_line(colour = "red") +
  geom_point(colour = "red")

## Gráficos de caja o boxplot

# Boxplot of price for each level of cut.
p <- ggplot(data = diamonds, mapping = aes(x = cut, y = price))
p + geom_boxplot()

# Tip as a function of day, with the axes flipped so the boxes lie
# horizontally.
# Original author's reading: Sunday tips are highest on average, while
# Saturday shows more unusually high tips. NOTE(review): a boxplot shows
# medians, not means; confirm against the rendered plot.
ggplot(data = tips, mapping = aes(x = day, y = tip)) +
  geom_boxplot() +
  coord_flip()